Beta Regression

Code
library(zoib)
library(tidyverse)
library(GGally)
library(ggpubr)
library(kableExtra)

theme_set(theme_pubr(legend = "bottom"))

# read in raw data
nnns0 <- read.csv("../NNNS_score_data.csv")

Using Haojia’s data cleaning:

Code
# clean up variable names
# remove the dots at the end of variable names
colnames(nnns0) <- gsub("\\.+$", "", colnames(nnns0))
# replace all the dots in variable names with underscores
colnames(nnns0) <- gsub("\\.+", "_", colnames(nnns0))

nnns <- nnns0 |> 
  filter(
    # exclude neurologic or airway anomaly
    Neurologic_Complication == 0, AirwayAnomalyYN == 0,
    # include infants from birth to 4 weeks old
    # there are two outliers with age at surgery > 30 days
    Age_at_Surgery_days <= 30
  ) |>
  # some binary variables have values of 1/2 or Y/N, recode them to 0/1
  mutate(
    Female = as.integer(sex_1_M_2_F == 2),
    Premature = as.integer(Premature == 1),
    Extubation_failure = as.integer(Extubation_failure == "Y"),
  ) |>
  
  # relabel cardiac anatomy
  mutate(
    Cardiac_Anatomy = factor(
      Cardiac_Anatomy, levels = 1:4,
      labels = c(
        "Single ventricle w/o arch obstruction",
        "Single ventricle w/ arch obstruction",
        "Two ventricle w/o arch obstruction",
        "Two ventricle w/ arch obstruction"
      )
    ),
    # for model building purposes, combine the 2 levels w/o arch obstruction
    Cardiac_Anatomy_collapsed = fct_collapse(
      Cardiac_Anatomy,
      "W/o arch obstruction" = c("Single ventricle w/o arch obstruction", "Two ventricle w/o arch obstruction"),
      "Single ventricle w/ arch obstruction" = "Single ventricle w/ arch obstruction",
      "Two ventricle w/ arch obstruction" = "Two ventricle w/ arch obstruction"
    )
  ) |>
  
  # convert date variables to date class
  mutate_at(
    vars("Date_PO_feeds_started", "Date_Reaching_Full_PO", "Date_Identified_as_not_yet_full_PO"), 
    as_date, format = "%m/%d/%Y"
  )

# drop unnecessary variables
nnns <- nnns |> select(!c(
  "sex_1_M_2_F", # use Female instead
  "Intubated_Pre_operatively", "bypass_used", "bypass_time_min", # not of interest 
  "Neurologic_Complication", "AirwayAnomalyYN" # already excluded
)) 

Use ZOIB package

ZOIB Model

\[ f(y_i|\eta_i) = \begin{cases} p_i, & y_i =0 \\ (1-p_i)q_i, & y_i =1 \\ (1 − p_i)(1 − q_i)Beta(\alpha_{i_1}, \alpha_{i_2}) &y_i \in (0, 1) \end{cases} \]

\[ \begin{aligned} logit\left(\mu^{(0,1)} = \frac{\alpha_1}{\alpha_1+ \alpha_2}\right) = \mathbf x_1 \boldsymbol \beta_1 + I_1(\mathbf z_{1,i}\gamma_{1,i}) \\ log\left(v_i = \alpha_1+ \alpha_2\right) = \mathbf x_2 \boldsymbol \beta_2 + I_2(\mathbf z_{2,i}\gamma_{2,i}) \\ logit\left(p_i\right) = \mathbf x_3 \boldsymbol \beta_3 + I_1(\mathbf z_{3,i}\gamma_{3,i}) \\ logit\left(q_i\right) = \mathbf x_4 \boldsymbol \beta_4 + I_1(\mathbf z_{4,i}\gamma_{4 ,i}) \end{aligned} \]

Fit ZOIB model

Code
set.seed(11282023) #Set seed, bayesian model!

tictoc::tic()
model1 =zoib(Percent_of_feeds_taken_by_mouth_at_discharge #Response
            ~Pre_Op_NNNS_attention_score+
              Post_Op_NNNS_attention_score+
              Female+Genetic_Syndrome_or_Chromosomal_Abnormality+
              Age_at_Surgery_days+Cardiac_Anatomy+
              Length_of_intubation_days+Length_of_Stay_days+
              Extubation_failure+GI_Complication+Premature
              #x1 design matrix
       | 1 #x2 design matrix- estimating variance
     |Pre_Op_NNNS_attention_score+
              Post_Op_NNNS_attention_score+
              Female+Genetic_Syndrome_or_Chromosomal_Abnormality+
              Age_at_Surgery_days+Cardiac_Anatomy+
              Length_of_intubation_days+Length_of_Stay_days+
              Extubation_failure+GI_Complication+Premature #x3 design matrix
     |Pre_Op_NNNS_attention_score+
              Post_Op_NNNS_attention_score+
              Female+Genetic_Syndrome_or_Chromosomal_Abnormality+
              Age_at_Surgery_days+Cardiac_Anatomy+
              Length_of_intubation_days+Length_of_Stay_days+
              Extubation_failure+GI_Complication+Premature, #x4 design matrix
     data = nnns,
     n.response = 1,
     zero.inflation = TRUE,
     one.inflation = TRUE,
     link.mu = "logit",
     link.x0 = "logit",
     link.x1 =  "logit",
     random = 0,
     n.chain = 4, # number of MCMC chains
     n.iter = 10000, #number of iterations before burn/thin
     n.thin = 10, # thinning period
     n.burn = 5000, # burn-in period 
     seeds = c(11,29,20,23) #vector of seeds for chains to make reproducable
     )
tictoc::toc()
saveRDS(model1, file="model1.RData")

Interpreting output:

b: vector of estimates from Eqn 1; that is, g(mu) = xb*b +z*gamma

d: vector of estimates from Eqn 2; that is, log(eta) = xd*d+z*gamma

b0: vector of estimates from Eqn 3; that is, g(p0) = x0*b0 +z*gamma

b1: vector of estimates from Eqn ;4 that is, g(p1) = x1*b1+z*gamma

Code
model1 =readRDS(file="model1.RData")

model1$coeff |> summary()

Iterations = 1:500
Thinning interval = 1 
Number of chains = 4 
Sample size per chain = 500 

1. Empirical mean and standard deviation for each variable,
   plus standard error of the mean:

             Mean        SD Naive SE Time-series SE
b[1]    -14.54643   3.79282 0.084810       0.112361
b[2]     -0.03687   0.26720 0.005975       0.007431
b[3]      2.11937   0.54855 0.012266       0.016298
b[4]     -0.11380   0.74085 0.016566       0.023209
b[5]     -0.20126   1.02204 0.022854       0.032359
b[6]      0.31640   0.17460 0.003904       0.006149
b[7]      2.46439   0.99358 0.022217       0.032028
b[8]     -1.56983   0.66351 0.014837       0.016602
b[9]     -1.45467   0.70862 0.015845       0.017257
b[10]     0.36189   0.22985 0.005140       0.007906
b[11]    -0.09706   0.06148 0.001375       0.002177
b[12]     1.61259   1.71191 0.038280       0.049590
b[13]    -0.60487   1.68460 0.037669       0.051732
b[14]     3.48052   1.49874 0.033513       0.042725
b0[1]   -31.92729  22.81937 0.510257       3.233061
b0[2]    -2.91548   1.48350 0.033172       0.043964
b0[3]    -0.42010   1.47138 0.032901       0.039653
b0[4]    -0.44539   2.06109 0.046087       0.046822
b0[5]    -1.94152   2.86765 0.064123       0.084092
b0[6]    -1.55378   0.75479 0.016878       0.025536
b0[7]    40.71062  20.73035 0.463545       3.701555
b0[8]    29.64725  20.86880 0.466640       3.261778
b0[9]    40.25138  20.61317 0.460925       3.587458
b0[10]    1.91432   1.04041 0.023264       0.039055
b0[11]    0.25466   0.18990 0.004246       0.005937
b0[12]    0.29247   6.75925 0.151141       0.185871
b0[13]  -11.98803   6.76792 0.151335       0.233134
b0[14]   -0.31578   3.33042 0.074470       0.075875
b1[1]  -246.94180 140.67302 3.145544      12.348793
b1[2]     9.71477  16.24570 0.363265       1.473622
b1[3]    19.30748  26.58829 0.594532       2.515335
b1[4]    27.16566  32.56935 0.728273       2.517479
b1[5]   -52.73324  41.81370 0.934983       3.222738
b1[6]     7.79613   4.82919 0.107984       0.466384
b1[7]    20.31243  49.05669 1.096941       5.049487
b1[8]   -37.15633  45.06154 1.007607       3.822780
b1[9]    50.27294  33.73077 0.754243       2.878735
b1[10]    1.97353   9.57740 0.214157       0.861758
b1[11]   -0.30451   1.63862 0.036641       0.134717
b1[12]   18.69918  76.09921 1.701630       5.621423
b1[13]  -73.30273  68.26586 1.526471       6.202346
b1[14]  -14.25481  66.51609 1.487345       6.268917
d         2.33198   0.47767 0.010681       0.010777

2. Quantiles for each variable:

             2.5%       25%        50%        75%     97.5%
b[1]    -21.98595  -16.8586  -14.62101  -12.23765  -6.81435
b[2]     -0.55928   -0.2062   -0.03942    0.13607   0.46849
b[3]      0.96452    1.7884    2.11214    2.45876   3.22496
b[4]     -1.51925   -0.5849   -0.12952    0.33908   1.41166
b[5]     -2.31980   -0.8449   -0.16481    0.44366   1.80340
b[6]     -0.04951    0.2049    0.31708    0.42510   0.66155
b[7]      0.65760    1.8160    2.44321    3.08921   4.52007
b[8]     -2.77039   -2.0152   -1.60031   -1.15968  -0.17322
b[9]     -2.84112   -1.9140   -1.45926   -1.00895  -0.01867
b[10]    -0.09425    0.2140    0.36381    0.50881   0.81918
b[11]    -0.21940   -0.1362   -0.09563   -0.05761   0.02343
b[12]    -2.01627    0.6105    1.63017    2.67430   4.82337
b[13]    -4.16874   -1.6223   -0.53803    0.49908   2.62893
b[14]     0.75630    2.4433    3.42364    4.48379   6.51000
b0[1]   -85.17524  -45.0485  -29.19349  -15.52361   4.39287
b0[2]    -6.03695   -3.8312   -2.81648   -1.89994  -0.22524
b0[3]    -3.45882   -1.3550   -0.38895    0.50419   2.43841
b0[4]    -4.51558   -1.8420   -0.44420    0.93104   3.54290
b0[5]    -7.82432   -3.7598   -1.80934    0.12009   3.24410
b0[6]    -3.24375   -2.0276   -1.48691   -1.02016  -0.26755
b0[7]    11.46390   24.9428   37.00649   52.07197  89.70499
b0[8]    -0.59272   14.1864   26.01167   41.52379  79.43970
b0[9]    11.22946   24.6897   36.52107   51.38559  88.94062
b0[10]    0.22011    1.1771    1.78536    2.53813   4.27730
b0[11]   -0.09228    0.1268    0.24428    0.37032   0.65696
b0[12]  -13.61180   -3.7584    0.16828    4.68888  13.50700
b0[13]  -26.99021  -15.8957  -11.18311   -7.22033  -1.09040
b0[14]   -7.04035   -2.4400   -0.20221    1.92480   6.06073
b1[1]  -555.48649 -338.5155 -237.70385 -145.17682  -2.87013
b1[2]   -21.51656   -0.8320    9.39656   20.46622  42.86643
b1[3]   -34.15392    1.3824   19.55716   37.91474  67.89801
b1[4]   -29.65046    4.9261   23.87769   47.31136  95.48295
b1[5]  -145.85491  -78.1293  -47.40276  -23.85150  16.10922
b1[6]    -0.32582    4.2574    7.43429   10.84467  17.94451
b1[7]   -79.16618  -13.0682   19.90946   52.82692 119.14744
b1[8]  -127.64679  -68.0751  -35.85644   -5.08605  45.37185
b1[9]   -10.98184   27.3263   47.85364   71.25838 125.73851
b1[10]  -15.56388   -4.6997    1.64875    8.23691  21.43392
b1[11]   -3.58036   -1.3097   -0.30256    0.78924   2.90186
b1[12] -153.65331  -27.8863   22.67187   73.98942 150.68028
b1[13] -213.09062 -117.9273  -72.54984  -27.02958  57.73211
b1[14] -142.41862  -61.1852  -12.44624   29.51041 126.59074
d         1.31880    2.0291    2.35622    2.65517   3.19815

Check convergence

Traceplots

Code
traceplot(model1$coeff)

Autocorrelation plots

Code
autocorr.plot(model1$coeff)

Gelman Plot

Code
gelman.diag(model1$coeff)
Potential scale reduction factors:

       Point est. Upper C.I.
b[1]        1.013       1.04
b[2]        1.005       1.02
b[3]        1.007       1.03
b[4]        1.014       1.05
b[5]        1.026       1.08
b[6]        1.013       1.04
b[7]        1.008       1.03
b[8]        1.003       1.01
b[9]        1.000       1.00
b[10]       1.029       1.08
b[11]       1.032       1.09
b[12]       1.027       1.07
b[13]       1.009       1.03
b[14]       1.009       1.03
b0[1]       1.344       1.91
b0[2]       1.004       1.01
b0[3]       1.000       1.00
b0[4]       1.002       1.01
b0[5]       1.002       1.01
b0[6]       1.004       1.01
b0[7]       1.454       2.27
b0[8]       1.454       2.24
b0[9]       1.451       2.25
b0[10]      0.999       1.00
b0[11]      1.004       1.01
b0[12]      1.005       1.02
b0[13]      1.001       1.01
b0[14]      1.001       1.00
b1[1]       1.007       1.01
b1[2]       1.046       1.11
b1[3]       1.026       1.07
b1[4]       1.024       1.07
b1[5]       1.018       1.04
b1[6]       1.028       1.08
b1[7]       1.023       1.04
b1[8]       1.056       1.15
b1[9]       1.021       1.06
b1[10]      1.009       1.03
b1[11]      1.048       1.13
b1[12]      1.093       1.25
b1[13]      1.024       1.07
b1[14]      1.066       1.19
d           1.006       1.02

Multivariate psrf

1.36